#!/usr/bin/env perl

use strict;
use warnings;
use FindBin qw/$RealBin/;

#convert UChicago, GTEX eQTL data to BED format with name column containing the annotation
warn "NOTICE: run '$0 addbin' will output BED file with 1 column as BIN column\n";
warn "NOTICE: ANNOVAR will consider 1st column to be BIN by default\n";
sleep 1;

#how to insert the resulting BED file into enlight database?
#~/projects/annoenlight/bin/annodb_admin -bedinput -f ../hg19_GTEX-140712-0019-17897.Brain_Cerebellum.ponly.txt -bincol -col 4 enlight_hg19.db -i GTEX_Brain_Cerebellum

my $addbin=shift @ARGV;
{
    my $in="hg18_All.individual.tracks.gff.v3";
    my $hg18out_ponly="hg18_UChicago_eQTL.txt"; #p values only
    my $hg19out_ponly="hg19_UChicago_eQTL.txt"; #p values only
    my $hg18out="hg18_UChicago_eQTL.all.txt";
    my $hg19out="hg19_UChicago_eQTL.all.txt";
    my @all=($hg18out,$hg19out,$hg18out_ponly,$hg19out_ponly);

    #chromosomal name, 0-start or 1-start must be converted
    &convertUChicago2BED($in,$hg18out,$hg18out_ponly);

    !system("$RealBin/any_hg_convert hg19 bed numerical 0 $hg18out $hg18out_ponly") or die "Failed to convert coordinate\n";
    !system("mv -f ${hg18out}hg19 $hg19out") and !system("mv -f ${hg18out_ponly}hg19 $hg19out_ponly") or die "Failed to rename\n";

    if (defined $addbin && $addbin eq 'addbin')
    {
	&addBIN(@all);
    }

    warn "UChicago data writtent to ",join(",",@all),"\n";
}

{
    my $prefix="GTEX-140712";
    #input has filename: hg19_$prefix.tab
    #output BED has filename: hg19_$prefix.$id.bed
    my $in="hg19_$prefix.tab";
    my %dict=(
	1=>"Lymphoblastoid_RNAseq",
	2=>"Liver",
	3=>"Brain_Cerebellum",
	4=>"Brain_Frontal_Cortex",
	5=>"Brain_Temporal_Cortex",
	6=>"Brain_Pons",
	7=>"Lymphoblastoid_microarray",
    );
	my $liftover="liftOver";
	my $hg19to18chain="/home/yunfeiguo/Downloads/liftover/hg19ToHg18.over.chain";


    for my $i(keys %dict)
    {
    my $hg18out="hg18_$prefix.$dict{$i}.all.txt";
    my $hg19out="hg19_$prefix.$dict{$i}.all.txt";
    my $hg18out_ponly="hg18_$prefix.$dict{$i}.txt"; #only contains p values
    my $hg19out_ponly="hg19_$prefix.$dict{$i}.txt"; #only contains p values
    my @all=($hg18out,$hg19out,$hg18out_ponly,$hg19out_ponly);

	&convertGTEX2bed($in,$hg19out,$i);

	#convert hg19 to hg18
	my $hg19tmp_ucscchr="/tmp/$$.".rand($$).".withucscchr";
	my $hg18tmp_ucscchr="/tmp/$$.".rand($$).".zeroStart.tmp";

	#chromosomal name, 0-start or 1-start must be converted
	&chr2ucsc($hg19out,$hg19tmp_ucscchr);
	!system("mv -f $hg19tmp_ucscchr $hg19out") or die "Failed to rename: $!\n";

	!system("$liftover -bedPlus=3 -tab $hg19out $hg19to18chain $hg18out /tmp/$$.liftover.tmp ") or die "Failed to lift over from hg19 to hg18: $!\n";

	!system("perl -pe '\@a=split /\\t/;\$a[3]=~s/.*P:(.*?),.*/\$1/;\$_=join \"\\t\",\@a;' $hg18out > $hg18out_ponly") or die "Failed to extract p values: $!\n";
	!system("perl -pe '\@a=split /\\t/;\$a[3]=~s/.*P:(.*?),.*/\$1/;\$_=join \"\\t\",\@a;' $hg19out > $hg19out_ponly") or die "Failed to extract p values: $!\n";

	if (defined $addbin && $addbin eq 'addbin')
	{
	    &addBIN(@all);
	}
    warn "GTEX $dict{$i} data writtent to @all\n";
    }
}

#perl -e 'open IN,"<All.individual.tracks.gff.v3";my %chr;while(<IN>){@f=split /\t/;$chr{$f[0]}=1;} map {print "$_ "} keys %chr'


#######################SUBROUTINES###########################
sub convertUChicago2BED
{
#chr1	Degner2012_dsQTL	Degner_dsQTL	801099	801099	3.68973163336755	.	.	DegnerDSQTL "chr1.801099 a dsQTL for 802000 to 802100" ; Note "Acts in cis "; Plot=<a href="http://eqtl.uchicago.edu/dsQTL_data/FIGURES/caQTL_7_FEB_2012/DegnerDSQTL.chr1.801099.html" >See QTL plots and view/leave comments</a>
    my $in=shift;
    my $out=shift;
    my $ponly=shift;
    open IN,'<',$in or die "Failed to read from $in: $!\n";
    open OUT,'>',$out or die "Failed to write to $out: $!\n";
    open PONLY,'>',$ponly or die "Failed to write to $ponly: $!\n";

    while (<IN>)
    {
	next if /^#/;
	s/[\r\n]+$//;

	my @f=split /\t/;
	die "At least 9 fields expected: @f at line $. of $in\n" unless @f>=9;
	my ($chr,$start,$end,$score,$note)=@f[0,3,4,5,8];

	#chr7 chr23 chrNULL chr20 chr26 chr22 chr14 chr19 chr8 chr1 chr11 chr6 chr6_qbl_hap2 chr17 chr21 chr6_cox_hap1 chr16 chr25 chr18 chr3 chr12 chr15 chrX chr4 chr2 chr9 chr13 chr10 chr9_random chr5_h2_hap1 chr5
	#chr must be converted to numbers!
	if ($chr=~/^chr(\d+)$/)
	{
	    $chr=$1;
	} elsif ($chr=~/^chrX$/)
	{
	    $chr=23;
	} else
	{
	    next;
	}

	my $anno="$note,SCORE:$score";
	$anno=~s%Plot=.*</a>%%;
	$anno=~s/\s+/ /g;
	$anno=~s/[";]//g;
	$anno=~s/\s+$//;
	$start--; #0-start
	print OUT "$chr\t$start\t$end\t$anno\n";
	print PONLY "$chr\t$start\t$end\t$score\n";
    }
    close IN;
    close OUT;
    close PONLY;

    my $tmp="/tmp/$$.sorted.eqtl";
    !system("msort -k n1,n2 $out > $tmp") and !system("mv -f $tmp $out") or die "Failed to sort: $!\n";
    !system("msort -k n1,n2 $ponly > $tmp") and !system("mv -f $tmp $ponly") or die "Failed to sort: $!\n";
}
sub convertGTEX2bed
{
    my $in=shift;
    my $out=shift;
    my $target_id=shift;
    open IN,'<',$in or die "Failed to read $in: $!\n";
    open OUT,'>',$out or die "Failed to write to $out: $!\n";

#1	1	rs17843593	6	32615949	ENSE00000617155	6	32609087	HLA-DQA1	3117	NM_002122.3	0		major histocompatibility complex, class II, DQ alpha 1
#chromosomes: 11 21 7 17 2 22 1 18 13 16 6 X 3 9 12 20 14 15 8 4 19 10 5 
    while(<IN>)
    {
	next if /^#/;
	s/[\r\n]+$//;
	my @f=split /\t/,$_,-1;
	die "14 fields expected at $. of $in: @f\n" unless @f == 14;
	my ($id,$chr,$pos,$gene,$p,$desc)=@f[1,3,4,8,11,13];

	if (defined $target_id)
	{
	    next unless $id==$target_id;
	}
	next unless $chr && $pos;
	$chr=23 if $chr eq 'X';

	$desc=~s/[";\s]+$/ /g;
	$gene=~s/[";\s]+$/ /g;
	$p=~s/[";\s]+$/ /g;

	my $anno="Gene:$gene,P:$p,Description:$desc\n";
	$anno=~s/[";]//g;
	$anno=~s/\s+$/ /;
	my $start=$pos-1; #0-start
	print OUT "$chr\t$start\t$pos\t$anno\n";
    }
    close IN;
    close OUT;

    my $tmp="/tmp/$$.sorted.eqtl";
    !system("msort -k n1,n2 $out > $tmp") and !system("mv -f $tmp $out") or die "Failed to sort: $!\n";
}

sub chr2ucsc
{
    my $in=shift;
    my $out=shift;

    open IN,'<',$in or die "Failed to read $in: $!\n";
    open OUT,'>', $out or die "Failed to write to $out: $!\n";

    while(<IN>)
    {
	if (/^(\d+)\t(\d+)\t(\d+)/)
	{
	    my $chr=$1;
	    if ($chr == 23)
	    {
		s/^\d+/chrX/;
	    } else
	    {
		s/^\d+/chr$chr/;
	    }
	}
	print OUT;
    }

    close IN;
    close OUT;
}

sub convertAnno2Exist
{
    my %hash=%{shift @_};

    while (my ($in,$out)=each %hash)
    {
	open IN,'<',$in or die "Failed to read $in: $!\n";
	open OUT,'>',$out or die "Failed to write to $out: $!\n";

	while(<IN>)
	{
	    chomp;
	    next unless /^[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+/;
	    s/^([^\t]+\t[^\t]+\t[^\t]+\t)[^\t]+/$1y/;
	    s/$/\n/;
	    print OUT;
	}
	close IN;
	close OUT;
    }
}

    sub addBIN
    {
	for my $file(@_)
	{
	    my $tmp="/tmp/$$".rand($$)."bin.tmp";
	    open IN,'<',$file or die "Failed to read $_: $!\n";
	    open OUT,'>',$tmp or die "failed to write to $tmp: $!\n";
	    while(<IN>)
	    {
		s/^/1\t/;
		print OUT;
	    }
	    close IN;
	    close OUT;
	    !system("mv -f $tmp $file") or die "Failed to rename: $!\n";
	}
    }
#sub ucsc2chr
#{
#    my $in=shift;
#    my $out=shift;
#
#    open IN,'<',$in or die "Failed to read $in: $!\n";
#    open OUT,'>', $out or die "Failed to write to $out: $!\n";
#
#    while(<IN>)
#    {
#	if (/^chr(\d+|X)\t(\d+)\t(\d+)/)
#	{
#	    my $start=$2+1;
#	    my $chr=$1;
#	    if ($chr eq 'X')
#	    {
#		s/^chrX\t\d+/23\t$start/;
#	    } else
#	    {
#		s/^chr\d+\t\d+/$chr\t$start/;
#	    }
#	}
#	print OUT;
#    }
#
#    close IN;
#    close OUT;
#}
